2. Estimation
- Import the data and name the column.
- I add an id column, which identifies each individual.
# Read the raw data and name the columns; the file has no header row.
# Pin the column types explicitly instead of relying on readr's guessing,
# so a malformed file fails loudly rather than silently changing types.
individual <- read_csv("../input/data/Data2020GIOPS2.csv",
                       col_names = c("choice", "age", "gender"),
                       col_types = cols(choice = col_double(),
                                        age = col_double(),
                                        gender = col_double()))
Parsed with column specification:
cols(
choice = col_double(),
age = col_double(),
gender = col_double()
)
# Attach an id column (one per row = one per individual) and move it first.
individual <- individual %>%
  mutate(id = row_number()) %>%       # sequential individual identifier
  select(id, everything())            # reorder so id is the leading column
N <- nrow(individual)                 # number of individuals in the sample
individual
- Next, I compute the mean utility for each alternative.
- To do so, I make a data frame
X that contains product characteristics, where rows correspond to alternatives and columns to dimensions of characteristics.
- I add the outside option row.
# Product characteristics: one row per alternative (j = 0 is the outside
# option, whose characteristics are normalized to zero).
alternatives <- 0:3
X <- tibble(
  j = alternatives,
  x_1 = c(0, 1, 1.2, 1.4),  # first characteristic; 0 for the outside option
  x_2 = c(0, 5, 3.5, 2)     # second characteristic; 0 for the outside option
)
X
- Using this, I merge
X and individual.
- I create the eps variable, which is \(\varepsilon_{ij}\), drawn from the Gumbel distribution.
- The variable y is the same as in the lecture note: \(y_{ij}=1\) if \(i\) chooses \(j\), and \(y_{ij}=0\) otherwise.
HW2_df_join can be found here
# Individual-by-alternative panel: every id is paired with all 4 alternatives
# (id varies slowest, j cycles 0-3 within each individual).
match <- tibble(
  id = rep(seq_len(N), each = 4),
  j = rep(0:3, times = N)
)
# Merge individual characteristics with product characteristics, then draw
# one Gumbel taste shock per individual-alternative row.
df <- HW2_df_join(individual, X, match) %>%
  mutate(eps = evd::rgumbel(n()))
df
- Then, I write a function to compute the mean utility,
delta, and the random utility, u.
# Compute the mean utility delta_ij = b0 + b1*age*x_1 + b2*gender*x_2 and the
# random utility u_ij = delta_ij + eps_ij for every row of df.
#
# beta : numeric vector of length 3 (intercept and two interaction slopes)
# df   : individual-by-alternative data frame with age, gender, x_1, x_2, eps
#
# Returns df with delta and u appended; both are normalized to 0 for the
# outside option (j == 0).
# NOTE(review): zeroing u (not just delta) also drops the Gumbel shock for
# the outside option — presumably intentional normalization; confirm. The
# likelihood below only uses delta, so estimation is unaffected either way.
compute_mean_utility <- function(beta, df){
  df %>%
    mutate(
      delta = beta[1] + beta[2] * age * x_1 + beta[3] * gender * x_2,
      u = delta + eps
    ) %>%
    mutate(
      delta = if_else(j == 0, 0, delta),  # outside-option normalization
      u = if_else(j == 0, 0, u)
    )
}
compute_mean_utility(c(1,2,3), df)
- Then, I write a function to compute the choice probabilities.
# Multinomial-logit choice probabilities: for each individual i and
# alternative j, P_ij = exp(delta_ij) / sum_k exp(delta_ik).
#
# beta : numeric vector of length 3, passed through to compute_mean_utility
# df   : individual-by-alternative data frame
#
# Returns the mean-utility data frame with exp_delta, sum_exp_delta, and
# prob_choice columns appended (intermediates kept for inspection).
compute_choice_prob <- function(beta, df){
  compute_mean_utility(beta, df) %>%
    group_by(id) %>%
    mutate(
      exp_delta = exp(delta),
      sum_exp_delta = sum(exp_delta),   # logit denominator, per individual
      prob_choice = exp_delta / sum_exp_delta
    ) %>%
    ungroup()
}
compute_choice_prob(c(1,2,3), df)
- Finally, I compute the (log-) likelihood.
# Log-likelihood of the multinomial logit model:
# sum over all (i, j) of y_ij * log(P_ij), where y_ij indicates i's choice.
#
# beta : numeric vector of length 3
# df   : individual-by-alternative data frame containing y
#
# Returns a single numeric value (to be MAXIMIZED).
likelihood <- function(beta, df){
  compute_choice_prob(beta, df) %>%
    summarise(loglik = sum(y * log(prob_choice))) %>%
    pull(loglik)
}
likelihood(c(1,2,3), df)
[1] -171477.5
- With the log-likelihood function at hand, we can optimize it.
beta_init <- c(-1, 1, 0.5)  # initial guess for (beta_0, beta_1, beta_2)
# fnscale = -1 makes optim maximize the log-likelihood instead of minimizing.
optim_result <- optim(
  par = beta_init,
  fn = likelihood,
  df = df,
  control = list(fnscale = -1),
  method = "Nelder-Mead"
)
optim_result
$par
[1] -1.88642927 0.09716284 -1.02709908
$value
[1] -8361.77
$counts
function gradient
174 NA
$convergence
[1] 0
$message
NULL
3. Standard Errors
- I calculate the standard errors using bootstrap.
S <- 1000                       # number of bootstrap replications
result <- array(dim = c(3, S))  # column s holds the s-th bootstrap estimate
beta_init <- c(-2, 0.1, -1)     # start near the point estimate for speed
eps <- evd::rgumbel(NROW(match)) # random shocks should be fixed outside the simulation loop
for(s in seq_len(S)){
  # Resample N individuals with replacement, then rebuild the
  # individual-by-alternative panel for the resampled ids.
  random_id <- sample(N, N, replace = TRUE)
  match_bs <- tibble(
    id = rep(random_id, each = 4),
    j = rep(0:3, times = N),
    eps = eps
  )
  df_bs <- HW2_df_join(individual, X, match_bs)
  # BUG FIX: the original call omitted control = list(fnscale = -1), so each
  # bootstrap replication MINIMIZED the log-likelihood while the point
  # estimate above maximized it — the resulting standard errors were wrong.
  result[, s] <- optim(beta_init, likelihood, df = df_bs,
                       control = list(fnscale = -1),
                       method = "Nelder-Mead")$par
}
# Bootstrap standard error for each coefficient = sd across replications.
boot_se <- apply(result, 1, sd)
se_1 <- boot_se[1]
se_2 <- boot_se[2]
se_3 <- boot_se[3]
paste("standard error for beta_0 is ", round(se_1, 3))
[1] "standard error for beta_0 is 0.414"
paste("standard error for beta_1 is ", round(se_2, 3))
[1] "standard error for beta_1 is 0.046"
paste("standard error for beta_2 is ", round(se_3, 3))
[1] "standard error for beta_2 is 1.734"